{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# <div align=\"center\"> 通过代码实现加深对RNN的理解 </div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import itertools\n",
    "import operator\n",
    "import numpy as np\n",
    "import nltk\n",
    "import sys\n",
    "import os\n",
    "import time\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class data_preprocess(object):\n",
    "    def __init__(self):\n",
    "        self.vocabulary_size = 8000\n",
    "        self.unknown_token = 'UNKNOWN_TOKEN'\n",
    "        self.start_token = 'SENTENCE_START'\n",
    "        self.end_token = \"SENTENCE_END\"\n",
    "        \n",
    "    def process(self, path):\n",
    "        if os.path.exists('/tmp/Datasets/X_train.npy'):\n",
    "            return\n",
    "        else:\n",
    "            os.mkdir('/tmp/Datasets')\n",
    "        \n",
    "        with open(path, 'rt', encoding=\"utf-8\") as f:\n",
    "            reader = csv.reader(f, skipinitialspace=True)\n",
    "            reader.__next__() # next(reader)\n",
    "            # 每行存在多个句子\n",
    "            sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])\n",
    "            # 每个句子前后加token\n",
    "            sentences = [\"%s %s %s\" % (self.start_token, x, self.end_token) for x in sentences]\n",
    "            print(\"sentences length: %d\" % len(sentences))\n",
    "            \n",
    "            # 把句子细化为单词\n",
    "            tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]            \n",
    "            # 计算每个单词的频率\n",
    "            word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))\n",
    "            \n",
    "            # 选出前vocabulary_size个单词\n",
    "            vocab = word_freq.most_common(self.vocabulary_size-1)\n",
    "            index_to_word = [x[0] for x in vocab]\n",
    "            index_to_word.append(self.unknown_token)\n",
    "            word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])\n",
    "            print(\"Using vocabulary size %d.\" % len(vocab))\n",
    "            print(\"The least frequent word is '%s' and appeared %d times.\" % (vocab[-1][0], vocab[-1][1]))\n",
    "            \n",
    "            # 将原始句子中没有出现在vocab里的单词标注为unkown_token\n",
    "            for i, sent in enumerate(tokenized_sentences):\n",
    "                tokenized_sentences[i] = [w if w in word_to_index else self.unknown_token for w in sent]\n",
    "                \n",
    "            for sent in tokenized_sentences:\n",
    "                print(sent[:-1])\n",
    "                print(sent[1:])\n",
    "                break\n",
    "                \n",
    "            X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])\n",
    "            y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])\n",
    "            \n",
    "            # Save to Dist \n",
    "            np.save('/tmp/Datasets/index_to_word.npy', index_to_word)\n",
    "            np.save('/tmp/Datasets/word_to_index.npy', word_to_index)\n",
    "            np.save('/tmp/Datasets/X_train.npy', X_train)\n",
    "            np.save('/tmp/Datasets/y_train.npy', y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据集下载: [百度云盘Datasets](https://pan.baidu.com/s/1gAFZ9gSf4pHJBt5W6_PgPQ \"提取码: gxk4\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = data_preprocess()\n",
    "g.process('/home/lidong/Datasets/ML/reddit-comments-2015-08.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(79170,) (79170,) (8000,)\n",
      "[0, 6, 3495, 7, 155, 796, 25, 222, 8, 32, 20, 202, 4955, 350, 91, 6, 66, 207, 5, 2] \n",
      " [6, 3495, 7, 155, 796, 25, 222, 8, 32, 20, 202, 4955, 350, 91, 6, 66, 207, 5, 2, 1]\n",
      "['SENTENCE_START' 'i' 'joined' 'a' 'new' 'league' 'this' 'year' 'and'\n",
      " 'they' 'have' 'different' 'scoring' 'rules' 'than' 'i' \"'m\" 'used' 'to'\n",
      " '.']\n"
     ]
    }
   ],
   "source": [
    "X_train = np.load('/tmp/Datasets/X_train.npy', allow_pickle=True)\n",
    "y_train = np.load('/tmp/Datasets/y_train.npy', allow_pickle=True)\n",
    "index_to_word = np.load('/tmp/Datasets/index_to_word.npy', allow_pickle=True)\n",
    "\n",
    "print(X_train.shape, y_train.shape, index_to_word.shape)\n",
    "print(X_train[0], '\\n', y_train[0])\n",
    "print(index_to_word[X_train[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCABULARY_SIZE = 8000\n",
    "\n",
    "# RNN\n",
    "class RNNNumpy:\n",
    "    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):\n",
    "        self.K = word_dim\n",
    "        self.H = hidden_dim\n",
    "        self.bptt_truncate = bptt_truncate\n",
    "        # 使用均匀分布初始化参数\n",
    "        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))\n",
    "        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))\n",
    "        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))\n",
    "        \n",
    "test_rnn = RNNNumpy(VOCABULARY_SIZE) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def softmax(x):\n",
    "    xt = np.exp(x - np.max(x))\n",
    "    return xt / np.sum(xt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$s_t = tanh(Ux_t + W s_{t-1})$\n",
    "\n",
    "$o_t = softmax(Vs_t)$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 72, 63, 13, 124, 5, 26, 1128, 208, 5, 324, 3, 329, 4, 112, 32, 75, 7, 4746, 4, 8, 84, 52, 9, 7, 3155, 1021, 492, 7534, 8, 133, 48, 3096, 4, 10, 95, 51, 4, 128, 17, 37, 314, 577, 2, 40]\n",
      "['SENTENCE_START' 'no' 'one' 'is' 'going' 'to' 'be' 'honest' 'enough' 'to'\n",
      " 'run' 'the' 'check' ',' 'see' 'they' \"'re\" 'a' 'felon' ',' 'and' 'then'\n",
      " 'all' 'of' 'a' 'sudden' 'immediately' 'turn' 'dishonest' 'and' 'say' '``'\n",
      " 'nah' ',' 'you' 'know' 'what' ',' 'here' \"'s\" 'your' 'gun' 'anyway' '.'\n",
      " \"''\"]\n",
      "(45, 8000) (46, 100)\n"
     ]
    }
   ],
   "source": [
    "def forward_propagation(self, x):\n",
    "    # x 是一个句子\n",
    "    T = len(x)\n",
    "    # T+1:多一个ｓ_{-1}的初始状态\n",
    "    s = np.zeros((T+1, self.H))\n",
    "    o = np.zeros((T, self.K))\n",
    "    for t in np.arange(T):\n",
    "        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))\n",
    "        o[t] = softmax(self.V.dot(s[t]))\n",
    "    return [o, s]\n",
    "\n",
    "RNNNumpy.forward_propagation = forward_propagation\n",
    "\n",
    "# 输入一个句子测试\n",
    "print(X_train[10])\n",
    "print(index_to_word[X_train[10]])\n",
    "o, s = test_rnn.forward_propagation(X_train[10])\n",
    "print(o.shape, s.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(45,)\n",
      "[1284 5221 7653 7430 1013 3562 7366 1874  224 6601 7299 6722 6892 3198\n",
      " 4480 5853 2926  261 4073 2371 6299 5376 4146 3761 7051 5981 1549 3765\n",
      " 4958 1835 6166 5192 2579 5879 4864 5132 6569 2800 2752 6821 4437 7021\n",
      " 3943 6912 3922]\n"
     ]
    }
   ],
   "source": [
    "def predict(self, x):\n",
    "    o, s = self.forward_propagation(x)\n",
    "    return np.argmax(o, axis = 1)\n",
    "\n",
    "RNNNumpy.predict = predict\n",
    "\n",
    "predictions = test_rnn.predict(X_train[10])\n",
    "print(predictions.shape)\n",
    "print(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "交叉熵损失函数:\n",
    "    \n",
    "$$\n",
    "L(y, o) = - \\frac{1}{N}\\sum_{n \\in N}y_n \\log o_n\n",
    "$$\n",
    "\n",
    "训练之前$o_i$出现的任意值的概率为$\\dfrac {1}{K}$, 即$K$个词概率均等, $y_n=1$则:\n",
    "$$\n",
    "\\begin{aligned}\n",
    "L(y, o) &= - \\frac{1}{N}\\sum_{n \\in N}y_n \\log \\frac{1}{K} \\\\\n",
    "    &= \\frac{1}{N} N \\log K \\\\\n",
    "    &= logK\n",
    "\\end{aligned}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[2 3]\n",
      " [5 6]\n",
      " [8 9]]\n",
      "[5 9]\n"
     ]
    }
   ],
   "source": [
    "# slice 插曲:\n",
    "\n",
    "l1 = np.array([[1,2,3], [4,5,6], [7,8,9]])\n",
    "# 选取所有行的第1,2列\n",
    "print(l1[:, (1,2)])\n",
    "# 选取第1行第1列, 选取第2行第2列\n",
    "print(l1[(1,2), (1,2)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Expected Loss for random prediction: 8.987197\n",
      "Actual loss: 8.987393\n"
     ]
    }
   ],
   "source": [
    "def calculate_loss(self, xs, ys):\n",
    "    # xs: 一批训练数据\n",
    "    N = sum((len(y_i) for y_i in ys))\n",
    "    L = 0\n",
    "    # 遍历句子\n",
    "    for i in np.arange(len(ys)):\n",
    "        o, s = self.forward_propagation(xs[i])\n",
    "        # 选出标签y(正确下标)对应的概率\n",
    "        correct_word_predictions = o[np.arange(len(ys[i])), ys[i]]\n",
    "        L += -1 * sum(np.log(correct_word_predictions))\n",
    "    return L/N\n",
    "\n",
    "RNNNumpy.calculate_loss = calculate_loss\n",
    "\n",
    "print(\"Expected Loss for random prediction: %f\" % np.log(8000))\n",
    "print(\"Actual loss: %f\" % test_rnn.calculate_loss(X_train[:1000], y_train[:1000]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# V可以做空间映射将100(H) 映射到 8000(K)空间中\n",
    "\n",
    "def bptt(self, x, y):\n",
    "    # x句子词向量\n",
    "    T = len(y)\n",
    "    o, s = self.forward_propagation(x)\n",
    "    dLdU = np.zeros(self.U.shape)\n",
    "    dLdV = np.zeros(self.V.shape)\n",
    "    dLdW = np.zeros(self.W.shape)\n",
    "    delta_o = o\n",
    "    delta_o[np.arange(len(y)), y] -= 1  # it is y_hat - y\n",
    "    for t in np.arange(T):\n",
    "        dLdV += np.outer(delta_o[t], s[t].T)    # at time step t, shape is word_dim * hidden_dim\n",
    "        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))\n",
    "        # backpropagation through time (for at most self.bptt_truncate steps)\n",
    "        # given time step t, go back from time step t, to t-1, t-2, ...\n",
    "        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:\n",
    "            # print(\"Backprogation step t=%d bptt step=%d\" %(t, bptt_step))\n",
    "            dLdW += np.outer(delta_t, s[bptt_step - 1])\n",
    "            dLdU[:, x[bptt_step]] += delta_t\n",
    "            # update delta for next step\n",
    "            dleta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)\n",
    "    return [dLdU, dLdV, dLdW]\n",
    "\n",
    "RNNNumpy.bptt = bptt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def numpy_sgd_step(self, x, y, learning_rate):\n",
    "    dLdU, dLdV, dLdW = self.bptt(x, y)\n",
    "    self.U -= learning_rate * dLdU\n",
    "    self.V -= learning_rate * dLdV\n",
    "    self.W -= learning_rate * dLdW\n",
    "    \n",
    "RNNNumpy.sgd_step = numpy_sgd_step\n",
    "\n",
    "def train_with_sgd(model, X_train, y_train, learning_rate = 0.005, nepoch = 100, evaluate_loss_after = 5):\n",
    "    # keep track of the losses so that we can plot them later\n",
    "    losses = []\n",
    "    num_examples_seen = 0\n",
    "    for epoch in range(nepoch):\n",
    "        # optionally evaluate the loss\n",
    "        if (epoch % evaluate_loss_after == 0):\n",
    "            loss = model.calculate_loss(X_train, y_train)\n",
    "            losses.append((num_examples_seen, loss))\n",
    "            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')\n",
    "            print(\"%s: loss after num_examples_seen=%d epoch=%d: %f\" %(time, num_examples_seen, epoch, loss))\n",
    "            # adjust the learning rate if loss increases\n",
    "            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):\n",
    "                learning_rate = learning_rate * 0.5\n",
    "                print(\"setting learning rate to %f\" %(learning_rate))\n",
    "            sys.stdout.flush()\n",
    "        # for each training example...\n",
    "        for i in range(len(y_train)):\n",
    "            # one sgd step\n",
    "            model.sgd_step(X_train[i], y_train[i], learning_rate)\n",
    "            num_examples_seen += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2019-09-19 16:24:19: loss after num_examples_seen=0 epoch=0: 8.987280\n",
      "2019-09-19 16:24:35: loss after num_examples_seen=200 epoch=2: 8.942253\n",
      "2019-09-19 16:24:50: loss after num_examples_seen=400 epoch=4: 6.361804\n",
      "2019-09-19 16:25:05: loss after num_examples_seen=600 epoch=6: 5.978650\n",
      "2019-09-19 16:25:20: loss after num_examples_seen=800 epoch=8: 5.799763\n"
     ]
    }
   ],
   "source": [
    "train_with_sgd(test_rnn, X_train[:100], y_train[:100], nepoch = 10, evaluate_loss_after = 2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
